#!/usr/bin/env perl
# by lhtk : lhtk80t7@gmail.com
#
# blastn and parse bls
#
# 将运行参数信息整合到 done.json 中
#
# 解析结束后，压缩 blast_results 文件夹
#
# 用了这么多 our，主要是为了提醒：它们在子函数里也使用了。
#
# 2017.4.11 不再提供 blastxml 解析，省点事。
#
use strict;
use warnings;

use autodie qw/open close/;
use Bio::SeqIO;
use Getopt::Long;

use Cwd qw/getcwd abs_path/;
# use File::Basename;

# 压缩、删除文件夹
use File::Find;
use Archive::Tar;
use File::Path qw/remove_tree/;
use File::Copy qw(move);

use JSON 'to_json';

BEGIN {
    if (defined $ENV{MOJO_HOME}) {
        unshift @INC, "$ENV{MOJO_HOME}/lib";
    } else {
        unshift @INC, "./lib";
    }
}

use ShareSubs qw/get_conf json2href update_done load_done/;

# -------------------------------------------
# mod_perl
our $home_dir = $ENV{MOJO_HOME};
$home_dir = '.' unless (defined $home_dir and -d $home_dir);
# ===========================================
# some settings
our $conf_dir = join '/', $home_dir, 'conf/b2dsc';
our $conf_dir_species = "$conf_dir/species";
# 先读基础配置
our $conf = get_conf(join '/', $conf_dir, 'b2dsc.json');
$conf->{known_probes} = get_conf(join '/', $conf_dir, "known_probes.json");

our $data_dir = join '/', $home_dir, $conf->{data_dir};
our $public_tmp_dir = join '/', $home_dir, $conf->{public_tmp_dir};
our $fasta_in = 'input.fa';
our $fasta_seq2filter = 'seq2filter.fa';
our $fasta_b2chrs = 'b2chrs.fa';
# https://www.ncbi.nlm.nih.gov/books/NBK279675/
# “blastn-short”, optimized for sequences less than 30 nucleotides.
our $short_seq_length = $conf->{blastn_short}; # short seq 界限值

# ===========================================
# command line args
# 
our $job_id;
# 什么情况下认为足够相似，以至于用同样的 blast results
our $pIdent_SM;
our $qCover_SM; # qcovhsp, 2019.1.17
my $disable_SM; # 是不是要 query seqs 之间相互比对？

# 或许应当添加错误检查，web 调用的时候不输出？还是输出到 log 里？
GetOptions(
    'jid|j=s' => \$job_id,
    'pIdent_SM|p=f' => \$pIdent_SM,     # optional
    'qCover_SM|q=f' => \$qCover_SM,     # optional
    'disable_SM|d' => \$disable_SM,     # flag, optional
) or die "Error in command line arguments!\n";

unless (defined $job_id) {
    print STDERR "Show me your job id!\n\n";
    exit 1;
}

$pIdent_SM = (defined $pIdent_SM and $pIdent_SM >=70 and $pIdent_SM <= 100) ? $pIdent_SM : 90; # 默认值为 90
$qCover_SM = (defined $qCover_SM and $qCover_SM >=70 and $qCover_SM <= 100) ? $qCover_SM : 90; # 默认值为 90

# 收集信息
our $info = load_done($job_id, $public_tmp_dir);
# ===========================================
our $species     = $info->{species};
load_conf_of_species($species);
our $chromosomes = $info->{chromosomes};
our $num_of_seqs = $info->{num_of_seqs};
our $ofmt = exists $info->{ofmt} ? $info->{ofmt} : $conf->{ofmt};

my $tmp_ofmt = $ofmt; # 拷贝一份
$tmp_ofmt =~ s/^\s*|\s*$//g;
our @bls_headers = split /\s+/, $tmp_ofmt;
shift @bls_headers; # 去掉 6

# 染色体对应 blastdb 中序列的名称，比如 3B : chr3B
# our $seqName2hgrpGenome = chrSeqName2hgrpGenome();

our $workdir = "$data_dir/$job_id";
unless (-d $workdir) {
    print STDERR "Error with 'jid': invalid!\n\n";
    exit 1;
}
$fasta_in = "$workdir/$fasta_in";
$fasta_seq2filter = "$workdir/$fasta_seq2filter";
$fasta_b2chrs = "$workdir/$fasta_b2chrs";

# -----------------------------------------------
# blastn
# 首先过滤掉 seq2filter.fa 中的相似序列
my $similar2seq2filter;
if (-e $fasta_seq2filter) {
    # 直接在函数里对 $fasta_in 文件进行了更新
    # 同时 $num_of_seqs--
    $similar2seq2filter = blastn2seq2filter();
    $info->{id_map}{seq2filter} = $similar2seq2filter if %$similar2seq2filter;
    $info->{num_of_seqs} = $num_of_seqs; # 更新

    # 没有序列的话，直接结束
    if ($num_of_seqs == 0) {
        $info->{status} = 'free'; # 运行结束，变更状态
        update_done($info, $job_id, $public_tmp_dir);
        exit 0;
    }
}
# 先是内部比对
my $similar_seqs = ((not $disable_SM) and $num_of_seqs > 1) ? inter_blastn() : {};
# 其次比对至 known probes
my $similar2known_probes = $disable_SM ? {} : blastn2known_probes();
my $id_map = get_id_map($similar_seqs, $similar2known_probes);
if (%$id_map) {
    while (my($k,$v) = each %$id_map) {
        $info->{id_map}{$k} = $v;
    }
}

our %minmax_of_seqs;
our %bestHits;
# 最后，比对至染色体
my $bls_dirs = blastn2chrs($id_map);

# parse blast results
# in .tsv format
our %tsv_files;
parse_blast_results($bls_dirs, $id_map) if (%$bls_dirs); # 如果为空，直接不用解析了
get_minmax_for_All(); # "All of them"

$info->{blastn} = \%tsv_files;
$info->{min} = $minmax_of_seqs{min};
$info->{max} = $minmax_of_seqs{max};
$info->{best_hits} = \%bestHits;

$info->{last_task} = $info->{status};
$info->{status} = 'free'; # 运行结束，变更状态

our $public_job_dir = "$public_tmp_dir/$job_id";
mkdir $public_job_dir unless -d $public_job_dir;

# 改为直接移动
targz_and_move4blast_results() if (%$bls_dirs);

# show blastn already done
update_done($info, $job_id, $public_tmp_dir);

# ===========================================
# subroutines
# ===========================================


# 根据 1-7A,3-6B 获取 blastdbs 
sub get_chromosome_dbs {
    our($conf, $species, $chromosomes);
    my %chr_dbs;

    # 获取 blastdb 位置信息
    my $species_db_conf = $conf->{species}{$species}{blastdb};

    for my $field (split /,/, $chromosomes) {
        my ($n1,$n2,$grp); # for 1-7A : n1 = 1, n2 = 7, grp = A
        # if ($field =~ /^(\d+)-(\d+)([a-zA-Z]+)$/) {
        # 允许这样子： 1-7B21
        if ($field =~ /^(\d+)-(\d+)([a-zA-Z]\w*)$/) {
            ($n1,$n2) = ($1 <= $2) ? ($1,$2) : ($2,$1); # e.g. 1,7
            $grp = $3;
        # } elsif ($field =~ /^(\d+)([a-zA-Z]+)$/) {
        } elsif ($field =~ /^(\d+)([a-zA-Z]\w*)$/) {
            ($n1,$n2) = ($1,$1);
            $grp = $2;
        }

        for my $n ($n1 .. $n2) {
            next unless (exists $species_db_conf->{chromosome}{$grp}{$n});

            my $cdb;
            my $db_dir = $species_db_conf->{db_dir};
            if ($db_dir =~ m#^/#) { # 绝对路径
                $cdb = join("/", $db_dir, $species_db_conf->{chromosome}{$grp}{$n}[0]);
            } else { # 相对路径
                $cdb = join("/", $home_dir, $db_dir, $species_db_conf->{chromosome}{$grp}{$n}[0]);
            }

            $chr_dbs{ $n . $grp } = $cdb;
        }
    }

    return \%chr_dbs;
}

# 将单独的 1 条序列 blastn 到若干 blast database
sub blastn1seq2dbs {
    my $sid = shift; # seq id
    my $blast_results_dir = shift;
    my $fasta = shift; # fasta file name
    my $seqLen = shift; # seq length
    # blastn 参数
    our($info, $short_seq_length);
    my $perc_identity = $info->{perc_identity};
    my $qcov_hsp_perc = $info->{qcov_hsp_perc};
    my $evalue        = $info->{evalue};
    my $evalue_short  = $info->{evalue_short};
    my($word_size, $word_size_short) = @{ $info }{qw/word_size word_size_short/};

    my $chr_dbs = get_chromosome_dbs(); # blastdb

    my $seq_bls_dir = $blast_results_dir . "/$sid";
    mkdir $seq_bls_dir;

    my $n_threads = $conf->{threads};
    # my $ofmt = $conf->{ofmt}; # 6 sseqid ...
    our $ofmt;

    for my $chr (keys %{ $chr_dbs }) {
        my $db = $chr_dbs->{$chr};
        # 假如考虑跨系统，需要修改这里
        my $command;
        if ($seqLen >= $short_seq_length) {
            # $command = "blastn -num_threads $n_threads -query $fasta -task blastn -db $db -out $seq_bls_dir/${sid}_$chr.bls -outfmt '$ofmt' -perc_identity $perc_identity -qcov_hsp_perc $qcov_hsp_perc -evalue $evalue -word_size $word_size";
            $command = "blastn -num_threads $n_threads -query $fasta -task blastn -db $db -outfmt '$ofmt' -perc_identity $perc_identity -qcov_hsp_perc $qcov_hsp_perc -evalue $evalue -word_size $word_size";
        } else {
            # $command = "blastn -num_threads $n_threads -query $fasta -task blastn-short -db $db -out $seq_bls_dir/${sid}_$chr.bls -outfmt '$ofmt' -perc_identity $perc_identity -qcov_hsp_perc $qcov_hsp_perc -evalue $evalue_short -word_size $word_size_short";
            $command = "blastn -num_threads $n_threads -query $fasta -task blastn-short -db $db -outfmt '$ofmt' -perc_identity $perc_identity -qcov_hsp_perc $qcov_hsp_perc -evalue $evalue_short -word_size $word_size_short";
        }
        $command .= ' -dust no' unless ($info->{dust});
        # system($command);
        run_blastn_and_change_sseqid($command, "$seq_bls_dir/${sid}_$chr.bls", $chr);
    }

    return $seq_bls_dir;
}

sub run_blastn_and_change_sseqid {
    my($cmd,$f_out,$cname) = @_;
    our @bls_headers;
    open my $fh, "$cmd |" or die $!;
    open my $fh_o, '>', $f_out;
    while (<$fh>) {
        chomp;
        next if /^\s*$/;
        my %tmp;
        @tmp{ @bls_headers } = split /\t/;
        $tmp{sseqid} = $cname; # change
        print $fh_o join("\t", @tmp{ @bls_headers }) . "\n";
    }
    close $fh or die $!;
    close $fh_o or die $!;
}

# 将多个序列逐次 blastn 至各个染色体 blastdb
# 返回 blast results 目录集
sub blastn2chrs {
    my $id_map = shift; # href
    our($workdir, $fasta_b2chrs, $conf, $species, $public_tmp_dir);
    our %tsv_files;
    our(%minmax_of_seqs, %bestHits);

    # 万一不存在该物种的 known_probes 呢？
    my $done_known_probes = {};
    if (exists $conf->{known_probes}{$species}) {
        $done_known_probes = load_done($conf->{known_probes}{$species}, $public_tmp_dir);
    }

    my %bls_dirs; # 存储每个探针序列 blast result 存储位置

    # 确定 blast results 放到哪里去
    my $blast_results_dir =  "$workdir/blast_results";
    mkdir $blast_results_dir;

    my $in = Bio::SeqIO->new(-file => "$fasta_b2chrs", -format => 'Fasta');
    my $tmp_fasta = "$blast_results_dir/tmp.fa"; # 临时 fasta 文件

    while (my $seq = $in->next_seq()) {
        my $sid = $seq->id();

        # 直接得到 tsv
        if ((exists $id_map->{known_probes}{$sid}) and %$done_known_probes) {
            my $kp_id = $id_map->{known_probes}{$sid};
            $tsv_files{$sid} = $done_known_probes->{blastn}{$kp_id};
            $minmax_of_seqs{min}{$sid} = $done_known_probes->{min}{$kp_id};
            $minmax_of_seqs{max}{$sid} = $done_known_probes->{max}{$kp_id};
            $bestHits{$sid} = $done_known_probes->{best_hits}{$kp_id};

            next;
        }

        my $out = Bio::SeqIO->new(-file => '>' . $tmp_fasta, -format => 'Fasta');
        $out->write_seq($seq);
        $bls_dirs{$sid} = blastn1seq2dbs($sid, $blast_results_dir, $tmp_fasta, $seq->length());
    }

    # 没专门比对到 chrs 上就不写了
    if (%bls_dirs) {
        # tabular 时添加一个说明文件
        # my $ofmt = $conf->{ofmt}; # 6 sseqid ...
        our $ofmt;
        if ($ofmt =~ /^6/) {
            my $str = $ofmt;
            $str =~ s/^6 //; $str =~ s/\s+/\t/g;
            open my $fh, '>', $blast_results_dir . '/specifiers.txt';
            print $fh "The specifiers for tabular blast results are:\n\n";
            print $fh $str . "\n";
            close $fh;
        }
    } else {
        rmdir $blast_results_dir;
    }
    return \%bls_dirs;
}

sub parse_blast_results {
    my $bls_dirs = shift;
    our %tsv_files;
    # our($workdir, $species, $conf);
    our($workdir, $species);
    our(%minmax_of_seqs, %bestHits);

    # 将解析出来的文件（.tsv 格式）放到名为 tsv 的文件夹下
    my $tsv_dir = "$workdir/tsv";
    mkdir $tsv_dir;

    # my $ofmt = $conf->{ofmt};
    our $ofmt;
    my $parse_bls;
    if ($ofmt =~ /^6/) {
        $parse_bls = \&parse_blastPlus_tab;
    } else {
        print STDERR "I don't know what to do ...\n";
        exit 1;
    }

    for my $sid (keys %$bls_dirs) {
        my $bls_dir = $bls_dirs->{$sid};
        # GAA_to_wheat.tsv
        # o - output
        my $o_tsv = "$tsv_dir/${sid}_to_${species}.tsv";
        $tsv_files{$sid} = $o_tsv;
        $tsv_files{$sid} =~ s#^\Q$home_dir\E/##; # 不保存 home_dir

        for my $f (<$bls_dir/*.bls>) { # 获得个最佳匹配
            next unless (-s $f) > 0;
            open my $fht, '<', $f;
            chomp(my $line = <$fht>);
            push @{ $bestHits{$sid} }, [ split /\t/, $line ];
            close $fht;
        }

        my $cmd; # command
        my $tmp_bls = "$bls_dir/all_bls.tmp";
        # 若考虑跨系统，需要修改这里
        $cmd = "cat $bls_dir/*.bls >$tmp_bls";
        system($cmd);
        my $minmax = $parse_bls->($tmp_bls, $o_tsv);
        $minmax_of_seqs{min}{$sid} = $minmax->{min};
        $minmax_of_seqs{max}{$sid} = $minmax->{max};

        unlink $tmp_bls;
    }
}

# 6 sseqid pident qcovhsp sstrand sstart send
sub parse_blastPlus_tab {
    my ($tab, $o_tsv) = @_; # file in, file out
    # our ($conf,$seqName2hgrpGenome);
    # our ($seqName2hgrpGenome);

    my %minmax = (
        min => {pident => 'NA', qcovhsp => 'NA',},
        max => {pident => 'NA', qcovhsp => 'NA',},
    );

    # my $ofmt = $conf->{ofmt};
    # our $ofmt;
    # my $tmp_ofmt = $ofmt; # 拷贝一份
    # $tmp_ofmt =~ s/^\s*|\s*$//g;
    # my @bls_headers = split /\s+/, $tmp_ofmt;
    # shift @bls_headers; # 去掉 6
    our @bls_headers;

    open my $fh_tsv, '>', $o_tsv;

    # sseqid pident qcovhsp sstrand sstart send
    my @fields = qw/sseqid pident qcovhsp sstrand sstart send/; # h - hit

    print $fh_tsv join("\t", @fields), "\n";

    if (-e $tab) {
        open my $fh_bls, '<', $tab;
        my %data = (); # +, - 各自存储排序，互不影响
        my %pre = (); # 上一个 hsp 的信息
        while (<$fh_bls>) {
            chomp;
            next if /^\s*$/;
            my %tmp;
            @tmp{ @bls_headers } = split /\t/;

            if (($minmax{min}{pident} eq 'NA') or ($tmp{pident} < $minmax{min}{pident})) {
                $minmax{min}{pident} = $tmp{pident};
            }
            if (($minmax{max}{pident} eq 'NA') or ($tmp{pident} > $minmax{max}{pident})) {
                $minmax{max}{pident} = $tmp{pident};
            }
            if (($minmax{min}{qcovhsp} eq 'NA') or ($tmp{qcovhsp} < $minmax{min}{qcovhsp})) {
                $minmax{min}{qcovhsp} = $tmp{qcovhsp};
            }
            if (($minmax{max}{qcovhsp} eq 'NA') or ($tmp{qcovhsp} > $minmax{max}{qcovhsp})) {
                $minmax{max}{qcovhsp} = $tmp{qcovhsp};
            }

            # $tmp{sseqid} = $seqName2hgrpGenome->{ $tmp{sseqid} };
            if (%pre and ($pre{sseqid} ne $tmp{sseqid}) ) {
                for my $s (sort keys %data) { # s - strand
                    @{ $data{$s} } = sort { $a->{sstart} <=> $b->{sstart} } @{ $data{$s} };
                    for my $d (@{ $data{$s} }) {
                        print $fh_tsv join("\t", @{ $d }{@fields}), "\n";
                    }
                }
                %data = ();
            }
            # 此前忘了交换起止，导致绘图不正确
            if ($tmp{sstrand} eq 'minus') {
                @tmp{qw/sstart send/} = @tmp{qw/send sstart/};
            }
            @pre{@fields} = @tmp{@fields};
            %tmp = %pre;
            push @{ $data{ $tmp{sstrand} } }, \%tmp;
        }
        close $fh_bls;

        if (%data) {
            for my $s (sort keys %data) {
                @{ $data{$s} } = sort { $a->{sstart} <=> $b->{sstart} } @{ $data{$s} };
                for my $d (@{ $data{$s} }) {
                    print $fh_tsv join("\t", @{ $d }{@fields}), "\n";
                }
            }
        }
    }
    close $fh_tsv;
    
    return \%minmax;
}

# 用 perl 模块实现
# 效率不如直接调用系统命令
sub archive_and_rmtree {
    # 必须是类似 blast_results 这样不带 / 的
    my $target_dir = shift; # 针对哪个目录操作？
    my $tar_gz_fname = $target_dir . '.tar.gz';

    our($workdir);
    my $c_wd = getcwd; # current workdir

    chdir $workdir;
    
    my $tar = Archive::Tar->new();
    my @files;
    find( sub { push @files, $File::Find::name }, $target_dir );
    unless (@files == 1) { # 如果空文件夹，就不压缩了
        $tar->add_files(@files);
        $tar->write($tar_gz_fname, COMPRESS_GZIP);
    }
    remove_tree($target_dir);

    chdir $c_wd;

    return join('/', $workdir, $tar_gz_fname);
}

# 输入序列内部比对，减少 blastn 运算量
# -------------------------------------
# 2016.12.8
# 先只管那些高度相似的 (highly_similar)
# 不考虑包含二体之类的 (part_of)
#
# 简单过滤，不考虑 blastn-short
sub inter_blastn {
    our($workdir, $fasta_in, $pIdent_SM, $qCover_SM);
    my %similar_seqs = ();

    # blast 2 seqs
    my $tmp_tsv = "$workdir/tmp_inter.tsv";
    # 6 qseqid sseqid pident qcovhsp qcovs sstrand sstart send
    # my $cmd = "blastn -task blastn -query $fasta_in -subject $fasta_in -outfmt '6 qseqid sseqid pident qcovs' -out $tmp_tsv -dust no -perc_identity $pIdent_SM";
    my $cmd = "blastn -task blastn -query $fasta_in -subject $fasta_in -outfmt '6 qseqid sseqid pident qcovhsp' -out $tmp_tsv -dust no -perc_identity $pIdent_SM";
    system($cmd);

    open my $fh, '<', $tmp_tsv;
    my %ids;
    while (<$fh>) {
        next if /^\s*$/;
        chomp;
        my($qseqid,$sseqid,$pident,$qcovhsp) = split /\t/;
        next if $qseqid eq $sseqid; # 自己比自己
        if ($pident >= $pIdent_SM and $qcovhsp >= $qCover_SM) {
            $ids{$qseqid}{$sseqid} = 1;
        }
    }
    close $fh;

    if (%ids) {
        my %skip;
        #while (my($k,$v) = each %ids) { # $k -> qseqid
                    # 这里用 each 不好，可能会造成错误
        for my $k (keys %ids) {
            next if exists $skip{$k};
            for my $sid (keys %{ $ids{$k} }) {
                if (exists $ids{$sid}{$k}) { # 同时存在，表示长度相似，整体高相似
                    $similar_seqs{$k}{$sid} = 1; # 真正高相似
                    $skip{$sid} = 1;
                }
            }
        }
    }

    unlink $tmp_tsv;

    return \%similar_seqs;
}

# 比对至 known probes
#
# 也是要求序列真正的长度相似，真正高相似
sub blastn2known_probes {
    our($conf, $species, $data_dir);
    # my %similar;

    # 直接用 fasta 来进行双序列比对，不用构建 blastdb
    if (not exists $conf->{known_probes}{$species}) {
        return {};
    }
    my $jid_kp = $conf->{known_probes}{$species};
    my $fasta_kp = join '/', $data_dir, $jid_kp, 'input.fa';
  
    # return blastn2fasta_file($fasta_kp);
    return find_a_highly_similar_seq4qseq($fasta_in, $fasta_kp);
}

# 2019.1.17 修改
# 提高要求，极其相似才行
# 因为比如单拷贝相似于多拷贝，但染色体分布情况还是会出现较大差异的。
sub blastn2fasta_file {
    # my $fasta_file = shift;
    my($fa_q, $fa_s) = @_; # query, subject
    
    # our($workdir, $fasta_in, $pIdent_SM, $qCover_SM);
    our($workdir, $pIdent_SM, $qCover_SM);
    my %similar = ();

    my $tmp_tsv = "$workdir/tmp_b2fasta_file.tsv";
    # my $cmd = "blastn -task blastn -query $fasta_in -subject $home_dir/$fasta_file -out $tmp_tsv -outfmt '6 qseqid sseqid pident qcovs length slen' -dust no -perc_identity $pIdent_SM";
    my $cmd = "blastn -task blastn -query $fa_q -subject $fa_s -out $tmp_tsv -outfmt '6 qseqid sseqid pident qcovhsp' -dust no -perc_identity $pIdent_SM";
    system($cmd);

    open my $fh, '<', $tmp_tsv;
    while (<$fh>) {
        next if /^\s*$/;
        chomp;
        # my($qseqid,$sseqid,$pident,$qcovs,$length,$slen) = split /\t/;
        my($qseqid,$sseqid,$pident,$qcovhsp) = split /\t/;
        # next if exists $similar{$qseqid};

        if ($pident >= $pIdent_SM and $qcovhsp >= $qCover_SM) {
            # $similar{$qseqid} = $sseqid;
            $similar{$qseqid}{$sseqid} = 1;
        }
    }
    close $fh;

    unlink $tmp_tsv;

    return \%similar;
}

sub find_a_highly_similar_seq4qseq {
    my($fa_q, $fa_s) = @_; # query, subject
    my %similar = ();
    # query blast to subject
    my $sim_q2s = blastn2fasta_file($fa_q, $fa_s);
    # subject blast to query
    my $sim_s2q = blastn2fasta_file($fa_s, $fa_q);

    while (my($q,$v) = each %$sim_q2s) {
        for my $s (keys %$v) {
            # 取一个即可
            if (exists $sim_s2q->{$s}{$q}) {
                $similar{$q} = $s;
                last;
            }
        }
    }
    return \%similar;
}

# 直接在这里把和 seq2filter 中序列相似的过滤掉
sub blastn2seq2filter {
    our($fasta_seq2filter, $fasta_in, $num_of_seqs);
    my $similar = find_a_highly_similar_seq4qseq($fasta_in, $fasta_seq2filter);
    # $similar = blastn2fasta_file($fasta_seq2filter);
   
    my $in =Bio::SeqIO->new(-file => $fasta_in, -format => 'Fasta');
    # 中转，直接写到变量里好了
    my $o_str;
    open my $str_o_fh, '>', \$o_str;
    my $stream_o = Bio::SeqIO->newFh(-fh => $str_o_fh, -format => 'Fasta');
    while (my $seq = $in->next_seq()) {
        my $id = $seq->id();
        if (exists $similar->{$id}) {
            $num_of_seqs--;
            next;
        }
        print $stream_o $seq;
    }
    close $str_o_fh;

    $o_str = '' unless $o_str;
    open my $fh, '>', $fasta_in;
    print $fh $o_str;
    close $fh;

    return $similar;
}

# 数一数有多少条序列
sub fasta_cnt {
    my $file = shift;

    open my $fh, '<', $file;
    my $cnt;
    while (<$fh>) {
        if (/^>/) {
            $cnt++;
        }
    }
    close $fh;

    return $cnt;
}

# 以后还可以考虑添加 part_of 什么的
sub get_id_map {
    my($sm_seqs, $sm2known_probes) = @_;
    my %map;
    our($fasta_in, $fasta_b2chrs);

    my %inter_sm; # 和其他序列相似的
    if (%$sm_seqs) {
        for my $q (keys %$sm_seqs) {
            for my $h (keys %{ $sm_seqs->{$q} }) {
                $map{inter}{$q}{$h} = 1;
                $inter_sm{$h} = 1;
            }
        }
    }
    if (%$sm2known_probes) {
        while (my($k,$v) = each %$sm2known_probes) {
            $map{known_probes}{$k} = $v;
        }
    }

    # 修改 fasta 文件
    if (%inter_sm) {
        my $in = Bio::SeqIO->new(-file => $fasta_in, -format => 'Fasta');
        my $out = Bio::SeqIO->new(-file => '>' . $fasta_b2chrs, -format => 'Fasta');
        while (my $seq = $in->next_seq()) {
            my $id = $seq->id();

            next if exists $inter_sm{$id};

            if (exists $map{inter}{$id}) {
                my $desc = $seq->desc();
                # highly_similar=xxx;rid=xxx;xxx
                $desc = join(';', 'highly_similar=' . join(',', sort keys %{ $map{inter}{$id} }), $desc);
                $seq->desc($desc);
            }
            $out->write_seq($seq);
        }
    } else {
        $fasta_b2chrs = $fasta_in; # 不再新建文件
    }

    return \%map;
}

# 染色体对应 blastdb 中序列的名称，比如 3B : chr3B
# 当前版本未使用此函数，2019.1.15 注释
# sub chrSeqName2dbName {
#     our($species,$conf);
#     my %maps;

#     my $href = $conf->{species}{$species}{blastdb}{chromosome};
#     for my $g (keys %$href) {
#         for my $n (keys %{ $href->{$g} }) {
#             my($db_name,$seq_name) = @{ $href->{$g}{$n} };
#             $maps{$seq_name} = $db_name;
#         }
#     }

#     return \%maps;
# }

# 比如对于短柄草 1 => 1Bd，使得更通用
# sub chrSeqName2hgrpGenome {
#     our($species,$conf);
#     my %maps;

#     my $href = $conf->{species}{$species}{blastdb}{chromosome};
#     for my $g (keys %$href) {
#         for my $n (keys %{ $href->{$g} }) {
#             my($db_name,$seq_name) = @{ $href->{$g}{$n} };
#             $maps{$seq_name} = $n . $g;
#         }
#     }

#     return \%maps;
# }

# All of them
sub get_minmax_for_All {
    our %minmax_of_seqs;
    my %minmax = (
        min => {pident => 'NA', qcovhsp => 'NA',},
        max => {pident => 'NA', qcovhsp => 'NA',},
    );

    while (my($k,$v) = each %{ $minmax_of_seqs{min} }) {
        if ($minmax{min}{pident} eq 'NA') {
            $minmax{min}{pident} = $v->{pident};
        } elsif ($v->{pident} ne 'NA' and $v->{pident} < $minmax{min}{pident}) {
            $minmax{min}{pident} = $v->{pident};
        }
        if ($minmax{min}{qcovhsp} eq 'NA') {
            $minmax{min}{qcovhsp} = $v->{qcovhsp};
        } elsif ($v->{qcovhsp} ne 'NA' and $v->{qcovhsp} < $minmax{min}{qcovhsp}) {
            $minmax{min}{qcovhsp} = $v->{qcovhsp};
        }
    }

    while (my($k,$v) = each %{ $minmax_of_seqs{max} }) {
        if (($minmax{max}{pident} eq 'NA') or ($v->{pident} > $minmax{max}{pident})) {
            $minmax{max}{pident} = $v->{pident};
        }
        if (($minmax{max}{qcovhsp} eq 'NA') or ($v->{qcovhsp} > $minmax{max}{qcovhsp})) {
            $minmax{max}{qcovhsp} = $v->{qcovhsp};
        }
    }
    $minmax_of_seqs{min}{'All of them'} = $minmax{min};
    $minmax_of_seqs{max}{'All of them'} = $minmax{max};
}

# 为 blast_results.tar.gz 创建一个软链接
# 当前版本未使用此函数，2019.1.15 注释
# sub targz_and_make_symlink4blast_results {
#     our($public_job_dir, $home_dir, $info);

#     my $tar_gz_fname = archive_and_rmtree('blast_results'); # 压缩，节省空间

#     # 试一下软链接
#     my $pub_tar_gz = "$public_job_dir/blast_results.tar.gz";
#     symlink abs_path($tar_gz_fname), abs_path($pub_tar_gz);
#     $pub_tar_gz =~ s#^\Q$home_dir\E/public##;
#     $info->{blast_results} = $pub_tar_gz;
# }

# 直接将 blast_results.tar.gz 移到 public/TMP 下
sub targz_and_move4blast_results {
    our($public_job_dir, $home_dir, $info);

    my $tar_gz_fname = archive_and_rmtree('blast_results'); # 压缩，节省空间

    my $pub_tar_gz = "$public_job_dir/blast_results.tar.gz";
    move $tar_gz_fname, $pub_tar_gz;
    $pub_tar_gz =~ s#^\Q$home_dir\E/public##;
    $info->{blast_results} = $pub_tar_gz;
}

sub load_conf_of_species {
    my $species = shift;
    # our $conf;
    my $f = join '/', $conf_dir_species, "$species.json";
    $conf->{species}{$species} = get_conf($f);

    return $conf;
}